import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
pd.options.display.max_columns = 1000
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score,make_scorer, confusion_matrix
import pickle
from category_encoders.one_hot import OneHotEncoder
import plotly.figure_factory as ff
import shap
from externals.plot import feature_importance_plot, feature_importance_shap_plot, score_distribution_plot, confusion_matrix_plot, threshold_search_plot, probability_pdf_perf, probability_cdf_perf,label_distribution_plot
from externals.undersampler import undersample_majority
from externals.tuning import ParameterTuning
# Load the raw dataset; "uuid" uniquely identifies each application and is
# used as the index throughout the pipeline.
df = pd.read_csv("dataset/dataset.csv", sep=";")
df = df.set_index("uuid")
df.shape
df.info()
# Checking whether ids are unique or not.
df.index.nunique() == df.shape[0]
# Separating the actual production test data (rows with no label) from the
# labelled data.
final_test_data = df[df.default.isnull()]
df = df.dropna(subset=["default"])
df.default.value_counts(normalize=True)
y = df.default
X = df.drop("default", axis=1)
# FIX: stratify=y keeps the train/test class ratios identical, which matters
# for this heavily imbalanced target (otherwise the small default class can
# be over/under-represented in the 20% test split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
I kept the preprocessing step simple, as the feature names are unknown and no explanation is given. I one-hot encode the categorical features and feed them to the model.
# Columns treated as categorical for one-hot encoding.  The order is kept
# stable so the encoded column layout is reproducible across runs.
cat_cols = [
    "account_status",
    "account_worst_status_0_3m",
    "account_worst_status_12_24m",
    "account_worst_status_3_6m",
    "account_worst_status_6_12m",
    "merchant_category",
    "merchant_group",
    "name_in_email",
    "status_last_archived_0_24m",
    "status_2nd_last_archived_0_24m",
    "status_3rd_last_archived_0_24m",
    "status_max_archived_0_6_months",
    "status_max_archived_0_12_months",
    "status_max_archived_0_24_months",
    "worst_status_active_inv",
]
# Fit the one-hot encoder on the training split only, to avoid leakage.
# BUG FIX: category_encoders.OneHotEncoder's first positional parameter is
# `verbose`, not `cols`, so the original call `OneHotEncoder(cat_cols)` set
# the verbosity flag to the column list and silently relied on the encoder
# auto-detecting object-dtype columns.  Pass the column list explicitly via
# the `cols` keyword.  (The duplicate import of OneHotEncoder was removed —
# it is already imported at the top of the file.)
ohe = OneHotEncoder(cols=cat_cols)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)
# Saved the one-hot encoder object to feed the production preprocessing steps.
with open("saved_properties/one_hot_encoder.pickle", "wb") as handle:
    pickle.dump(ohe, handle, protocol=pickle.HIGHEST_PROTOCOL)
sampling_rate = [1, .75, .5]
models = {}
train_data_ids = {}
results = {}
# Class-imbalanced dataset: try undersampling the majority class at several rates.
for rate in sampling_rate:
    # BUG FIX: the original loop reassigned X_train/y_train on each iteration,
    # so rates 0.75 and 0.5 undersampled the ALREADY-undersampled data instead
    # of the full training set.  Bind the resampled split to fresh names and
    # leave X_train/y_train intact.
    # undersample_majority (externals/undersampler.py) only undersamples the
    # majority class, taking the sampling rate into account.
    X_res, y_res = undersample_majority(X_train, y_train, rate)
    tuning = ParameterTuning(X_res, y_res, X_test, y_test)  # defined in externals/tuning.py
    clf = tuning.param_tuning(n_iter=50, refit_metric=False, n_splits=3)  # for dummy training, keep n_iter as 1
    res_tmp, model_dict = tuning.cv_results()
    # Saving models, training data indexes and results, keyed by sampling rate.
    models[rate] = model_dict
    train_data_ids[rate] = X_res.index.to_list()
    results[rate] = res_tmp
# Persist models, train data indexes, results and test data as pickles for
# possible future evaluations.
artifacts = {
    "models": models,
    "train_data_ids": train_data_ids,
    "results": results,
    "test_data": X_test,
}
for artifact_name, artifact in artifacts.items():
    with open(f"saved_properties/{artifact_name}.pickle", "wb") as fh:
        pickle.dump(artifact, fh, protocol=pickle.HIGHEST_PROTOCOL)
# Inspect the cross-validation results for each sampling rate.
results[1]
results[.75]
results[.5]
# Model chosen for production: sampling rate 0.75, tuned/refit on AUC.
# The selection rationale is discussed in the following markdown cell.
final_model = models[.75]["auc"]
Model selection is a business decision and can be made using various custom metrics built from a profitability analysis. Opportunity cost, APR, the cost of a lost sale and the cost of a default should all be taken into account when creating a metric to measure the model's profitability. What I cared about here was to find a model that minimizes sales loss while capturing close to 60% of the defaults. I chose the model with sampling rate 0.75, tuned on AUC, since it caused only a 6% loss of sales and correctly classified 57% of defaults.
# Persist the chosen model so the production scoring service can load it.
final_model_path = "saved_properties/final_model.pickle"
with open(final_model_path, "wb") as fh:
    pickle.dump(final_model, fh, protocol=pickle.HIGHEST_PROTOCOL)
I used a SHAP plot to understand how feature values affect the model, which also helps to spot potential bias. Since the meanings of the features are unknown, the plot does not tell us much right now, but a data scientist with business knowledge of the features could use it to check how sensible the model's behaviour is.
Reading the graph: higher values of the "avg_payment_span_0_12m" feature push the model score up, while higher values of the "num_arch_ok_12_24m" feature push the model score down.
# SHAP summary of per-feature effects on the chosen model, computed on the test split.
feature_importance_shap_plot(final_model,X_test)
As expected, our model tends to score defaults higher. Note that this graph is normalized, which is why the separation appears so sharp.
# Score the held-out split and line the probabilities up with the true labels
# for the label-distribution plot.
test_scores = final_model.predict_proba(X_test)[:, 1]
pr = pd.DataFrame({"prob": test_scores}, index=X_test.index)
pr["true"] = y_test
label_distribution_plot(pr)
Our precision values seem low due to the class imbalance.
# Distribution of model scores, split by true label, on the test set.
score_distribution_plot(final_model,X_test,y_test)
This graph shows how the TNR and TPR vary across different thresholds. If a threshold had to be set, it could be selected here according to the business decision.
# TNR/TPR as a function of the decision threshold.
threshold_search_plot(final_model,X_test,y_test)
Thresholding is useful in problems like this, so I plotted confusion matrices to see the model's performance at different thresholds.
# Confusion matrices over a sweep of decision thresholds: 0.30 .. 0.59 in 0.01 steps.
for th in (t / 100 for t in range(30, 60)):
    preds = np.where(final_model.predict_proba(X_test)[:, 1] >= th, 1, 0)
    confusion_matrix_plot(y_test, preds, "threshold" + str(th), (6, 6))
If we had approached the problem differently and divided it into approval, rejection and grey areas, the code below could be used to choose the two thresholds by inspecting the metrics at different threshold values.
def double_threshold_report(clf, x_test, y_test, thl, thu):
    """Report confusion metrics for a two-threshold (approve/reject/grey) policy.

    Scores below ``thl`` are auto-rejections (pred=0), scores at or above
    ``thu`` are auto-approvals (pred=1); scores strictly between the two
    thresholds fall into the grey zone and are excluded from the automated
    metrics.  All results are printed, nothing is returned.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``
    x_test : pd.DataFrame of features
    y_test : pd.Series of true binary labels aligned with ``x_test``
    thl, thu : float
        Lower / upper score thresholds; expects ``thl <= thu``.
    """
    prob = clf.predict_proba(x_test)
    res = pd.DataFrame(prob[:, 1], columns=["score"])
    res.index = y_test.index
    res["true"] = y_test
    res["pred"] = 1
    idx = res[res["score"] < thl].index
    res.loc[idx, "pred"] = 0
    # Keep only the rows where the policy makes an automatic decision.
    tmp = res[(res["score"] >= thu) | (res["score"] <= thl)]
    tp = len(tmp[(tmp["pred"] == 1) & (tmp["true"] == 1)])
    fp = len(tmp[(tmp["pred"] == 1) & (tmp["true"] == 0)])
    fn = len(tmp[(tmp["pred"] == 0) & (tmp["true"] == 1)])
    tn = len(tmp[(tmp["pred"] == 0) & (tmp["true"] == 0)])

    def _safe_div(num, den):
        # FIX: avoid ZeroDivisionError when a confusion-matrix cell is empty.
        return num / den if den else float("nan")

    print("roc_auc", roc_auc_score(tmp["true"], tmp["pred"]))
    print("tp:", tp, " fp:", fp, " tn:", tn, " fn:", fn)
    # FIX: "covarage" -> "coverage" in the printed label.
    print("auto decision coverage:", (tp + fp + tn + fn) / len(res))
    print("precision:", _safe_div(tp, tp + fp))
    print("neg_precision:", _safe_div(tn, tn + fn))
    # FIX: "precsision" -> "precision" in the printed label.
    print("1-precision:", 1 - _safe_div(tp, tp + fp))
    print("test_size:", len(res))
    print("recall:", _safe_div(tp, tp + fn))
# Example policy: auto-reject below 0.5, auto-approve at/above 0.9, grey zone between.
double_threshold_report(final_model,X_test,y_test,0.5,.9)
Run the following command in your terminal to get the score for the corresponding uuid. Changing the uuid in the request body and sending the request is enough.
!curl -XPOST "https://5pwx4y80a8.execute-api.us-east-2.amazonaws.com/klarnaStage" -d '{"uuid":"6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7"}'
# Score the unlabelled production rows with the chosen model.
final_test_data = final_test_data.drop("default", axis=1)
final_test_data = ohe.transform(final_test_data)
# Align the column order with the training matrix the model was fitted on.
final_test_data = final_test_data[X_train.columns]
predictions = final_model.predict_proba(final_test_data)[:, 1]
overall_test_data_results = pd.Series(
    predictions, index=final_test_data.index, name="pd"
).reset_index()
overall_test_data_results.head()
# FIX: index=False — `reset_index` already promoted uuid to a regular column,
# so the default behaviour would also write a meaningless RangeIndex column.
overall_test_data_results.to_csv("test_predictions.csv", sep=";", index=False)